import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Load the energy dataset from the CSV file in the working directory and
# preview its first rows.
data = pd.read_csv("KAG_energydata_complete.csv")
data.head()
Since the description provided with the data is not very clear, I have summarised the meaning of the columns here as per my understanding.
# Column names, dtypes and non-null counts.
data.info()

# Dataset dimensions.
print('The number of rows in dataset is - ', len(data))
print('The number of columns in dataset is - ', len(data.columns))

# Number of missing values per column, sorted in ascending order.
data.isnull().sum().sort_values()
As we can see, there are no null values in the data.
# Summary statistics of the dataset.
data.describe()

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
train, test = train_test_split(data, random_state=40, test_size=0.25)
print('Shape of Train Set - ', train.shape)
print('Shape of Test Set - ', test.shape)
# Column names grouped by measurement type, so related columns can be
# selected together.
col_temp = ["T{}".format(i) for i in range(1, 10)]    # T1 .. T9
col_hum = ["RH_{}".format(i) for i in range(1, 10)]   # RH_1 .. RH_9
col_weather = [
    "T_out",
    "Tdewpoint",
    "RH_out",
    "Press_mm_hg",
    "Windspeed",
    "Visibility",
]
col_light = ["lights"]
col_randoms = ["rv1", "rv2"]
col_target = ["Appliances"]
# Separate the independent variables (features) from the dependent
# variable (target) using the training rows only.
feature_vars = train[
    col_temp + col_hum + col_weather + col_light + col_randoms
]
target_vars = train[col_target]

# Summary statistics per column group.
train[col_temp].describe()
train[col_hum].describe()
train[col_weather + col_light + col_randoms].describe()
We can see that the lights column has a large number of zero values; to confirm this, we will look at the distribution of its values.
# Frequency of each distinct value in the lights column.
feature_vars["lights"].value_counts()

# Summary statistics of the target.
target_vars.describe()
1) Temperature columns - Temperature inside the house varies between 14.89 Deg and 29.85 Deg; temperature outside (T6) varies between -6.06 Deg and 28.29 Deg.
2) Humidity columns - Humidity inside the house varies between 20.60% and 63.36%, with the exception of RH_5 (Bathroom) and RH_6 (Outside house), which vary between 29.82% and 96.32% and between 1% and 99.9% respectively.
3) Appliances - 75% of Appliance consumption is less than 100 Wh. With a maximum consumption of 1080 Wh, there will be outliers in this column, and there are a small number of cases where consumption is very high.
4) Lights column - Initially I believed the lights column would be able to give useful information. With 11438 zero (0) entries in 14801 rows, this column will not add any value to the model. Hence, for now, I will be dropping this column.
# The lights column is dominated by zero entries, so it is of little use
# and is excluded from the features for the rest of the analysis.
#
# Rebind to the frame returned by drop() rather than dropping in place:
# feature_vars was selected from train, and mutating such a selection in
# place is what pandas' SettingWithCopyWarning guards against. With
# inplace=True, drop() returns None, so the previous `_ = ...` assignment
# captured nothing.
feature_vars = feature_vars.drop(['lights'], axis=1)
feature_vars.tail()
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Line plot of appliance energy consumption against the date column for
# the whole dataset.
visData = go.Scatter(x=data["date"], y=data["Appliances"], mode="lines")
layout = go.Layout(
    title="Appliance energy consumption pattern",
    xaxis={"title": "Date"},
    yaxis={"title": "(Wh)"},
)
fig = go.Figure(data=[visData], layout=layout)
iplot(fig)
# Flag every record by day type for the time-series comparison:
# 0.0 for Monday-Friday (dayofweek 0-4), 1.0 for Saturday/Sunday (5-6).
# Note that, despite its name, WEEKDAY is therefore 1 on weekends.
data['WEEKDAY'] = (pd.to_datetime(data['date']).dt.dayofweek >= 5).astype(float)

# Number of records per day type (the original author noted 5472 weekend
# recordings).
data['WEEKDAY'].value_counts()
# Keep only the records flagged as weekdays (WEEKDAY == 0).
temp_weekday = data[data["WEEKDAY"].eq(0)]

# Line plot of appliance energy consumption on weekdays only.
visData = go.Scatter(
    x=temp_weekday["date"],
    y=temp_weekday["Appliances"],
    mode="lines",
)
layout = go.Layout(
    title="Appliance energy consumption pattern on weekdays",
    xaxis={"title": "Date"},
    yaxis={"title": "(Wh)"},
)
fig = go.Figure(data=[visData], layout=layout)
iplot(fig)
# Keep only the records flagged as weekends (WEEKDAY == 1).
temp_weekend = data[data["WEEKDAY"].eq(1)]

# Line plot of appliance energy consumption on weekends only.
visData = go.Scatter(
    x=temp_weekend["date"],
    y=temp_weekend["Appliances"],
    mode="lines",
)
layout = go.Layout(
    title="Appliance energy consumption pattern on weekend",
    xaxis={"title": "Date"},
    yaxis={"title": "(Wh)"},
)
fig = go.Figure(data=[visData], layout=layout)
iplot(fig)
# Histogram of every feature column, 20 bins each.
feature_vars.hist(figsize=(16, 16), bins=20)

# Closer look at five selected features on a 3x2 grid of axes.
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm
# the installed version still provides it before upgrading.
f, ax = plt.subplots(3, 2, figsize=(16, 16))
vis0, vis1, vis2, vis3, vis4 = [
    sns.distplot(feature_vars[name], bins=10, ax=axis)
    for name, axis in zip(
        ["T9", "RH_6", "RH_out", "Visibility", "Windspeed"], ax.flat
    )
]
# Only five panels are drawn; remove the unused sixth axes.
f.delaxes(ax.flat[-1])

# Distribution of the target on its own figure.
f = plt.figure(figsize=(12, 5))
plt.xlabel('Appliance consumption in Wh')
plt.ylabel('Frequency')
sns.distplot(target_vars, bins=10)
1) Temperature - All the columns follow a normal distribution except T9.
2) Humidity - All columns follow a normal distribution except RH_6 and RH_out; as per my understanding, this is because these sensors are outside the house.
3) Appliance - This column is positively skewed; most of the values are around the mean of 100 Wh. So I think that there are outliers in this column.
4) Visibility - This column is negatively skewed.
5) Windspeed - This column is positively skewed.
# Correlation between the temperature, humidity, weather, target and
# random columns of the training set.
train_corr = train[col_temp + col_hum + col_weather + col_target + col_randoms]
corr = train_corr.corr()

# Hide the upper triangle (diagonal included): it only repeats the values
# of the lower triangle. The builtin bool is used because the np.bool
# alias was deprecated and later removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(16, 14))

# Annotated heat map of the correlation matrix.
sns.heatmap(corr, annot=True, fmt=".2f", mask=mask)

# Each heatmap cell spans [i, i + 1], so ticks are placed at i + 0.5 to
# centre every label on its row/column instead of on the cell edge.
tick_positions = np.arange(len(corr.columns)) + 0.5
plt.xticks(tick_positions, corr.columns)
plt.yticks(tick_positions, corr.columns)
plt.show()
1) Temperature - All the temperature variables from T1-T9 and T_out have a positive correlation with the target Appliances. Four columns have a high degree of correlation with T9 - T3, T5, T7, T8; also T6 & T_out have a high correlation (both are temperatures from outside). Hence T6 & T9 can be removed from the training set, as the information provided by them can be provided by other fields.
2) Weather attributes - Visibility, Tdewpoint and Press_mm_hg have low correlation values.
3) Humidity - There are no significantly high correlation cases (> 0.9) for humidity sensors.
4) As expected, the random variables have no role to play.